# First I have imported the libraries and data
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objs as go
from pandas.plotting import parallel_coordinates
from plotly.subplots import make_subplots
# Load the per-player statistics; the relative path means the CSV is looked up in the current working directory.
df = pd.read_csv("players_list.csv")
# Preview the first five rows to check that the file was parsed as expected.
df.head()
| Unnamed: 0 | Player | Match_played | Starts | Minutes_played | Gls | Ast | Shots | shots_on_target | goals_per_shot | ... | penalty_kick_won | penalty_kick_conceded | own_goal | aerial_duo_won | aerial_duo_lost | Pos | Age | Club_country | Club | Team | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | Emiliano Martínez | 7 | 7 | 690 | 0 | 0 | 0 | 0 | NaN | ... | 0.0 | 0.0 | 0 | 2.0 | 0.0 | GK | 29 | eng | Aston Villa | Argentina |
| 1 | 1 | Lionel Messi | 7 | 7 | 690 | 7 | 3 | 27 | 13 | 0.11 | ... | 1.0 | 0.0 | 0 | 2.0 | 7.0 | FW | 34 | fr | Paris S-G | Argentina |
| 2 | 2 | Nicolás Otamendi | 7 | 7 | 690 | 0 | 1 | 1 | 0 | 0.00 | ... | 0.0 | 1.0 | 0 | 21.0 | 13.0 | DF | 34 | pt | Benfica | Argentina |
| 3 | 3 | Rodrigo De Paul | 7 | 7 | 599 | 0 | 0 | 7 | 3 | 0.00 | ... | 0.0 | 0.0 | 0 | 2.0 | 3.0 | MF | 28 | es | Atlético Madrid | Argentina |
| 4 | 4 | Nahuel Molina | 7 | 6 | 567 | 1 | 1 | 2 | 1 | 0.50 | ... | 0.0 | 0.0 | 0 | 0.0 | 6.0 | DF | 24 | es | Atlético Madrid | Argentina |
5 rows × 42 columns
# Some exploration into the data
# Size of the table as a (rows, columns) tuple.
df.shape
(680, 42)
So we can see that this dataset has 680 rows and 42 columns.
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()
| Unnamed: 0 | Match_played | Starts | Minutes_played | Gls | Ast | Shots | shots_on_target | goals_per_shot | goals_per_shot_on_target | ... | fouls_commited | fouls_drawn | offside | crosses | penalty_kick_won | penalty_kick_conceded | own_goal | aerial_duo_won | aerial_duo_lost | Age | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 680.000000 | 680.000000 | 680.000000 | 680.000000 | 680.000000 | 680.000000 | 680.000000 | 680.000000 | 431.000000 | 247.000000 | ... | 680.000000 | 680.000000 | 680.000000 | 680.000000 | 677.000000 | 677.000000 | 680.000000 | 677.000000 | 677.000000 | 680.000000 |
| mean | 339.500000 | 2.933824 | 2.070588 | 191.194118 | 0.250000 | 0.177941 | 2.107353 | 0.705882 | 0.097448 | 0.302794 | ... | 2.351471 | 2.238235 | 0.373529 | 3.201471 | 0.028065 | 0.033973 | 0.002941 | 2.553914 | 2.550960 | 26.586765 |
| std | 196.443376 | 1.520611 | 1.753856 | 147.774808 | 0.698726 | 0.487652 | 3.031781 | 1.337780 | 0.215148 | 0.386570 | ... | 2.614393 | 2.891103 | 0.864215 | 5.562789 | 0.165281 | 0.181295 | 0.054193 | 3.339578 | 2.904162 | 4.161129 |
| min | 0.000000 | 1.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 17.000000 |
| 25% | 169.750000 | 2.000000 | 1.000000 | 68.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 24.000000 |
| 50% | 339.500000 | 3.000000 | 2.000000 | 173.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 2.000000 | 1.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 2.000000 | 26.000000 |
| 75% | 509.250000 | 4.000000 | 3.000000 | 270.000000 | 0.000000 | 0.000000 | 3.000000 | 1.000000 | 0.095000 | 0.500000 | ... | 3.000000 | 3.000000 | 0.000000 | 4.000000 | 0.000000 | 0.000000 | 0.000000 | 4.000000 | 4.000000 | 29.000000 |
| max | 679.000000 | 7.000000 | 7.000000 | 690.000000 | 8.000000 | 3.000000 | 29.000000 | 13.000000 | 1.000000 | 1.000000 | ... | 17.000000 | 22.000000 | 7.000000 | 40.000000 | 1.000000 | 1.000000 | 1.000000 | 21.000000 | 17.000000 | 39.000000 |
8 rows × 37 columns
# Column names with their non-null counts and dtypes, plus the memory footprint.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 680 entries, 0 to 679 Data columns (total 42 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Unnamed: 0 680 non-null int64 1 Player 680 non-null object 2 Match_played 680 non-null int64 3 Starts 680 non-null int64 4 Minutes_played 680 non-null int64 5 Gls 680 non-null int64 6 Ast 680 non-null int64 7 Shots 680 non-null int64 8 shots_on_target 680 non-null int64 9 goals_per_shot 431 non-null float64 10 goals_per_shot_on_target 247 non-null float64 11 non_penalty_goals 680 non-null int64 12 penalty_goal 680 non-null int64 13 CrdY 680 non-null int64 14 CrdR 680 non-null int64 15 pass_comp 677 non-null float64 16 pass_attempt 677 non-null float64 17 pass_comp% 674 non-null float64 18 tackles_made 677 non-null float64 19 successful_tackles 680 non-null int64 20 Blocks 677 non-null float64 21 interceptions 680 non-null int64 22 clearence 677 non-null float64 23 Err 677 non-null float64 24 Touches 677 non-null float64 25 successful_dribbles 677 non-null float64 26 dribbles_attempted 677 non-null float64 27 successful_dribbles% 435 non-null float64 28 fouls_commited 680 non-null int64 29 fouls_drawn 680 non-null int64 30 offside 680 non-null int64 31 crosses 680 non-null int64 32 penalty_kick_won 677 non-null float64 33 penalty_kick_conceded 677 non-null float64 34 own_goal 680 non-null int64 35 aerial_duo_won 677 non-null float64 36 aerial_duo_lost 677 non-null float64 37 Pos 680 non-null object 38 Age 680 non-null int64 39 Club_country 679 non-null object 40 Club 679 non-null object 41 Team 680 non-null object dtypes: float64(17), int64(20), object(5) memory usage: 223.2+ KB
# Distinct values of the 'Team' column.
df['Team'].unique()
array(['Argentina', 'France', 'Croatia', 'Morocco', 'Netherlands',
'England', 'Brazil', 'Portugal', 'Japan', 'Senegal', 'Australia',
'Switzerland', 'Spain', 'United States', 'Poland',
'Korea Republic', 'Germany', 'Ecuador', 'Cameroon', 'Uruguay',
'Tunisia', 'Mexico', 'Belgium', 'Ghana', 'Saudi Arabia', 'IR Iran',
'Costa Rica', 'Denmark', 'Serbia', 'Wales', 'Canada', 'Qatar'],
dtype=object)
# Count the distinct teams that appear in the 'Team' column.
distinct_teams = df['Team'].unique()
number_of_teams = len(distinct_teams)
print("The total numbers of country which took part in the FIFA 2022 world cup are:", number_of_teams)
The total numbers of country which took part in the FIFA 2022 world cup are: 32
# Number of missing values in each column.
df.isnull().sum()
Unnamed: 0 0 Player 0 Match_played 0 Starts 0 Minutes_played 0 Gls 0 Ast 0 Shots 0 shots_on_target 0 goals_per_shot 249 goals_per_shot_on_target 433 non_penalty_goals 0 penalty_goal 0 CrdY 0 CrdR 0 pass_comp 3 pass_attempt 3 pass_comp% 6 tackles_made 3 successful_tackles 0 Blocks 3 interceptions 0 clearence 3 Err 3 Touches 3 successful_dribbles 3 dribbles_attempted 3 successful_dribbles% 245 fouls_commited 0 fouls_drawn 0 offside 0 crosses 0 penalty_kick_won 3 penalty_kick_conceded 3 own_goal 0 aerial_duo_won 3 aerial_duo_lost 3 Pos 0 Age 0 Club_country 1 Club 1 Team 0 dtype: int64
So we saw that there are null values in several columns. The largest counts are in the ratio columns (goals_per_shot, goals_per_shot_on_target, successful_dribbles%); a ratio such as goals_per_shot is undefined for a player who took no shots — for example, Emiliano Martínez has 0 shots and a NaN goals_per_shot. This is common in football, because players play according to their positions and roles on the field, so a player in a specific position will not register every attribute. For example, "Nicolás Otamendi" plays in the defense (DF) position, so his goals (Gls) show 0: it is normal for such a player to prevent goals rather than score them, and that is why his blocks and clearances show values.
# Let's explore some initial correlations between features
# Correlation is only defined for numeric data. The numeric columns are selected
# explicitly because pandas >= 2.0 raises a ValueError when df.corr() meets text
# columns such as 'Player' or 'Team'; this form works on older versions as well.
sns.heatmap(df.select_dtypes(include="number").corr())
<AxesSubplot:>
# Pairwise correlation of the numeric columns only; calling df.corr() on the whole
# frame fails on pandas >= 2.0 because of the text columns.
df.select_dtypes(include="number").corr()
| Unnamed: 0 | Match_played | Starts | Minutes_played | Gls | Ast | Shots | shots_on_target | goals_per_shot | goals_per_shot_on_target | ... | fouls_commited | fouls_drawn | offside | crosses | penalty_kick_won | penalty_kick_conceded | own_goal | aerial_duo_won | aerial_duo_lost | Age | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Unnamed: 0 | 1.000000 | -0.490549 | -0.309573 | -0.368539 | -0.202565 | -0.225942 | -0.242663 | -0.232980 | -0.126007 | -0.122046 | ... | -0.250052 | -0.238431 | -0.122613 | -0.146816 | -0.094811 | -0.064572 | -0.082036 | -0.174930 | -0.195327 | 0.050901 |
| Match_played | -0.490549 | 1.000000 | 0.778739 | 0.848660 | 0.326088 | 0.339639 | 0.486162 | 0.403811 | 0.116687 | 0.105231 | ... | 0.529319 | 0.511456 | 0.309100 | 0.352233 | 0.142118 | 0.157687 | 0.091725 | 0.418102 | 0.469373 | 0.057818 |
| Starts | -0.309573 | 0.778739 | 1.000000 | 0.965453 | 0.269201 | 0.309023 | 0.415141 | 0.334010 | 0.076403 | 0.078123 | ... | 0.520052 | 0.509324 | 0.261445 | 0.373811 | 0.130318 | 0.191804 | 0.075288 | 0.486916 | 0.448455 | 0.147282 |
| Minutes_played | -0.368539 | 0.848660 | 0.965453 | 1.000000 | 0.280319 | 0.328026 | 0.429074 | 0.342593 | 0.069960 | 0.086830 | ... | 0.546712 | 0.520005 | 0.250981 | 0.375959 | 0.126929 | 0.203570 | 0.098317 | 0.506957 | 0.463340 | 0.136834 |
| Gls | -0.202565 | 0.326088 | 0.269201 | 0.280319 | 1.000000 | 0.314446 | 0.692270 | 0.792513 | 0.526255 | 0.556852 | ... | 0.227555 | 0.340104 | 0.498764 | 0.207545 | 0.322541 | -0.020694 | 0.019447 | 0.159343 | 0.313837 | 0.005192 |
| Ast | -0.225942 | 0.339639 | 0.309023 | 0.328026 | 0.314446 | 1.000000 | 0.413410 | 0.360277 | 0.082697 | 0.082227 | ... | 0.207322 | 0.312522 | 0.226461 | 0.445524 | 0.085024 | 0.048916 | 0.035896 | 0.169448 | 0.240142 | 0.037016 |
| Shots | -0.242663 | 0.486162 | 0.415141 | 0.429074 | 0.692270 | 0.413410 | 1.000000 | 0.858581 | 0.044147 | -0.024020 | ... | 0.344178 | 0.497450 | 0.568691 | 0.388710 | 0.244262 | -0.017590 | 0.042894 | 0.242734 | 0.410471 | 0.002004 |
| shots_on_target | -0.232980 | 0.403811 | 0.334010 | 0.342593 | 0.792513 | 0.360277 | 0.858581 | 1.000000 | 0.260415 | 0.051345 | ... | 0.284780 | 0.456810 | 0.524458 | 0.342233 | 0.258002 | -0.043891 | 0.032264 | 0.163289 | 0.348779 | -0.023982 |
| goals_per_shot | -0.126007 | 0.116687 | 0.076403 | 0.069960 | 0.526255 | 0.082697 | 0.044147 | 0.260415 | 1.000000 | 0.853405 | ... | 0.071507 | 0.095503 | 0.107968 | -0.030793 | 0.056041 | 0.017345 | -0.010310 | 0.079189 | 0.135264 | -0.020073 |
| goals_per_shot_on_target | -0.122046 | 0.105231 | 0.078123 | 0.086830 | 0.556852 | 0.082227 | -0.024020 | 0.051345 | 0.853405 | 1.000000 | ... | 0.039211 | -0.005219 | 0.052264 | -0.072580 | 0.022076 | 0.079855 | 0.004496 | 0.119314 | 0.117469 | 0.029050 |
| non_penalty_goals | -0.205807 | 0.326183 | 0.253526 | 0.261712 | 0.957893 | 0.260537 | 0.635841 | 0.761241 | 0.608908 | 0.646610 | ... | 0.221725 | 0.267833 | 0.470926 | 0.178294 | 0.248661 | -0.016256 | 0.024905 | 0.168142 | 0.303858 | -0.026405 |
| penalty_goal | -0.084916 | 0.151189 | 0.169368 | 0.182824 | 0.583556 | 0.298520 | 0.481133 | 0.456541 | 0.016825 | 0.027556 | ... | 0.122181 | 0.362369 | 0.310391 | 0.179130 | 0.358692 | -0.022159 | -0.006403 | 0.049122 | 0.173970 | 0.091776 |
| CrdY | -0.015626 | 0.195284 | 0.231851 | 0.240299 | -0.001865 | 0.036258 | 0.043610 | -0.001031 | -0.017840 | 0.027346 | ... | 0.395270 | 0.187620 | -0.025381 | 0.024578 | 0.058323 | 0.163839 | 0.016123 | 0.261886 | 0.140532 | 0.063494 |
| CrdR | -0.013421 | 0.003350 | -0.003098 | 0.012401 | 0.055086 | 0.090304 | 0.003622 | 0.031310 | 0.134753 | 0.072968 | ... | 0.122152 | 0.000313 | 0.055803 | -0.016626 | -0.013100 | 0.091932 | -0.004178 | 0.021857 | 0.031853 | 0.040019 |
| pass_comp | -0.337401 | 0.637517 | 0.748612 | 0.794798 | 0.068449 | 0.227109 | 0.212047 | 0.141490 | -0.049060 | -0.016957 | ... | 0.471239 | 0.385063 | 0.009464 | 0.272480 | 0.047728 | 0.193699 | 0.116189 | 0.472333 | 0.285711 | 0.102591 |
| pass_attempt | -0.341138 | 0.675461 | 0.791300 | 0.837237 | 0.092865 | 0.259407 | 0.246132 | 0.171911 | -0.037637 | -0.009599 | ... | 0.497775 | 0.416231 | 0.037927 | 0.333904 | 0.058868 | 0.196556 | 0.113158 | 0.488611 | 0.312131 | 0.113512 |
| pass_comp% | -0.130021 | 0.085859 | 0.151147 | 0.161234 | -0.091741 | -0.008618 | -0.060513 | -0.092056 | -0.109049 | -0.067914 | ... | 0.062353 | 0.021789 | -0.132147 | -0.108529 | -0.039181 | 0.055401 | 0.044067 | 0.051754 | -0.075159 | -0.014793 |
| tackles_made | -0.257068 | 0.543468 | 0.569544 | 0.594768 | 0.041464 | 0.214729 | 0.185477 | 0.134276 | 0.007901 | -0.024909 | ... | 0.537500 | 0.453978 | 0.023657 | 0.333281 | 0.062889 | 0.100473 | 0.152503 | 0.328125 | 0.262341 | -0.034588 |
| successful_tackles | -0.248619 | 0.513188 | 0.521488 | 0.549360 | 0.040102 | 0.220294 | 0.165689 | 0.126689 | 0.008717 | -0.024014 | ... | 0.504422 | 0.424772 | -0.005760 | 0.290894 | 0.063049 | 0.099269 | 0.157435 | 0.286430 | 0.249880 | -0.049538 |
| Blocks | -0.217767 | 0.524557 | 0.574308 | 0.590212 | 0.138760 | 0.236160 | 0.271825 | 0.209146 | 0.070606 | 0.091623 | ... | 0.496361 | 0.377945 | 0.120288 | 0.311087 | 0.097089 | 0.045647 | 0.066120 | 0.417858 | 0.404824 | -0.052795 |
| interceptions | -0.203893 | 0.461795 | 0.526662 | 0.556883 | -0.010884 | 0.172832 | 0.105343 | 0.037981 | 0.022267 | 0.030630 | ... | 0.483546 | 0.306065 | -0.004548 | 0.213401 | 0.009063 | 0.163685 | 0.050412 | 0.393334 | 0.253199 | 0.023618 |
| clearence | -0.154188 | 0.397278 | 0.513323 | 0.544387 | -0.020366 | 0.082782 | -0.013595 | -0.062684 | 0.053994 | 0.152817 | ... | 0.320596 | 0.102569 | -0.040836 | 0.026026 | -0.027512 | 0.137042 | 0.064533 | 0.552747 | 0.313761 | 0.097433 |
| Err | 0.030215 | 0.074056 | 0.142806 | 0.150960 | -0.027723 | -0.027436 | -0.026400 | -0.035104 | 0.015221 | 0.042957 | ... | -0.016883 | -0.034164 | 0.001924 | -0.032978 | -0.005918 | 0.047486 | -0.012502 | 0.012676 | -0.041721 | 0.086336 |
| Touches | -0.351865 | 0.715367 | 0.823268 | 0.869127 | 0.140083 | 0.290248 | 0.307929 | 0.226339 | -0.020882 | -0.000032 | ... | 0.540109 | 0.465694 | 0.083339 | 0.367966 | 0.079906 | 0.193219 | 0.112844 | 0.510811 | 0.359277 | 0.098973 |
| successful_dribbles | -0.168322 | 0.382079 | 0.356748 | 0.365697 | 0.413883 | 0.340259 | 0.572614 | 0.479053 | 0.001031 | -0.044802 | ... | 0.271066 | 0.503152 | 0.387890 | 0.443577 | 0.192038 | -0.018780 | 0.010131 | 0.069986 | 0.192382 | -0.068166 |
| dribbles_attempted | -0.176865 | 0.432040 | 0.388740 | 0.402446 | 0.445515 | 0.379736 | 0.631225 | 0.555133 | 0.022006 | -0.054264 | ... | 0.336980 | 0.602395 | 0.442160 | 0.531044 | 0.216382 | -0.021852 | 0.000903 | 0.062195 | 0.249767 | -0.111055 |
| successful_dribbles% | 0.024714 | -0.020488 | 0.044061 | 0.031018 | -0.034576 | -0.023019 | -0.019713 | -0.061044 | -0.053311 | -0.010854 | ... | -0.038616 | -0.091477 | -0.066925 | -0.051810 | -0.042086 | 0.004206 | 0.015685 | 0.063424 | -0.040089 | 0.165479 |
| fouls_commited | -0.250052 | 0.529319 | 0.520052 | 0.546712 | 0.227555 | 0.207322 | 0.344178 | 0.284780 | 0.071507 | 0.039211 | ... | 1.000000 | 0.451670 | 0.193417 | 0.285152 | 0.188675 | 0.183119 | 0.044667 | 0.393174 | 0.435137 | -0.037667 |
| fouls_drawn | -0.238431 | 0.511456 | 0.509324 | 0.520005 | 0.340104 | 0.312522 | 0.497450 | 0.456810 | 0.095503 | -0.005219 | ... | 0.451670 | 1.000000 | 0.286759 | 0.401862 | 0.273809 | 0.043732 | 0.014321 | 0.261862 | 0.412019 | -0.014820 |
| offside | -0.122613 | 0.309100 | 0.261445 | 0.250981 | 0.498764 | 0.226461 | 0.568691 | 0.524458 | 0.107968 | 0.052264 | ... | 0.193417 | 0.286759 | 1.000000 | 0.282094 | 0.267811 | -0.015039 | 0.007954 | 0.167785 | 0.274592 | 0.029881 |
| crosses | -0.146816 | 0.352233 | 0.373811 | 0.375959 | 0.207545 | 0.445524 | 0.388710 | 0.342233 | -0.030793 | -0.072580 | ... | 0.285152 | 0.401862 | 0.282094 | 1.000000 | 0.131982 | -0.018590 | -0.021510 | 0.039812 | 0.169380 | 0.016454 |
| penalty_kick_won | -0.094811 | 0.142118 | 0.130318 | 0.126929 | 0.322541 | 0.085024 | 0.244262 | 0.258002 | 0.056041 | 0.022076 | ... | 0.188675 | 0.273809 | 0.267811 | 0.131982 | 1.000000 | -0.031867 | -0.009250 | 0.097756 | 0.180386 | 0.058036 |
| penalty_kick_conceded | -0.064572 | 0.157687 | 0.191804 | 0.203570 | -0.020694 | 0.048916 | -0.017590 | -0.043891 | 0.017345 | 0.079855 | ... | 0.183119 | 0.043732 | -0.015039 | -0.018590 | -0.031867 | 1.000000 | -0.010208 | 0.142347 | 0.073971 | 0.040507 |
| own_goal | -0.082036 | 0.091725 | 0.075288 | 0.098317 | 0.019447 | 0.035896 | 0.042894 | 0.032264 | -0.010310 | 0.004496 | ... | 0.044667 | 0.014321 | 0.007954 | -0.021510 | -0.009250 | -0.010208 | 1.000000 | 0.072522 | -0.000956 | -0.040319 |
| aerial_duo_won | -0.174930 | 0.418102 | 0.486916 | 0.506957 | 0.159343 | 0.169448 | 0.242734 | 0.163289 | 0.079189 | 0.119314 | ... | 0.393174 | 0.261862 | 0.167785 | 0.039812 | 0.097756 | 0.142347 | 0.072522 | 1.000000 | 0.617175 | 0.133261 |
| aerial_duo_lost | -0.195327 | 0.469373 | 0.448455 | 0.463340 | 0.313837 | 0.240142 | 0.410471 | 0.348779 | 0.135264 | 0.117469 | ... | 0.435137 | 0.412019 | 0.274592 | 0.169380 | 0.180386 | 0.073971 | -0.000956 | 0.617175 | 1.000000 | 0.062759 |
| Age | 0.050901 | 0.057818 | 0.147282 | 0.136834 | 0.005192 | 0.037016 | 0.002004 | -0.023982 | -0.020073 | 0.029050 | ... | -0.037667 | -0.014820 | 0.029881 | 0.016454 | 0.058036 | 0.040507 | -0.040319 | 0.133261 | 0.062759 | 1.000000 |
37 rows × 37 columns
# Let's find out which players have scored most goals
def visualize_top_players(df, metric, position, title, ylabel, save_as=None):
    """Draw a horizontal bar chart of the ten leading players in one position.

    Parameters
    ----------
    df : pandas.DataFrame
        Player table; it must contain the columns 'Pos', 'Player' and ``metric``.
    metric : str
        Column to rank by (largest first); it is also the bar length.
    position : str
        Value of the 'Pos' column to keep, e.g. 'FW'.
    title : str
        Figure title.
    ylabel : str
        Label for the metric axis. The bars are horizontal, so despite its
        name this text is placed on the x-axis.
    save_as : str, optional
        When given (and non-empty), the figure is written to this path at
        300 dpi before it is shown.
    """
    plt.figure(figsize=(12, 6))
    # nlargest already returns the rows in descending order of `metric`, so no
    # extra sort is needed (a second, unstable sort could reshuffle tied players).
    top_10_players = df[df['Pos'] == position].nlargest(10, metric)
    sns.barplot(x=metric, y='Player', data=top_10_players, palette='viridis')
    plt.title(title)
    plt.xlabel(ylabel)
    plt.ylabel('Player')
    plt.tight_layout()
    # Save before show(): the figure may be released once the window is closed.
    if save_as:
        plt.savefig(save_as, dpi=300)
    plt.show()
# Chart the ten forwards (Pos == 'FW') with the highest 'Gls' values.
visualize_top_players(df, 'Gls', 'FW', 'Top 10 Forwards by Goals Scored', 'Goals Scored')
# Radar chart comparing ten leading players on goals, assists, shots,
# completed passes, touches and successful dribbles.
radar_columns = ["Gls", "Ast", "Shots", "pass_comp", "Touches", "successful_dribbles"]

# nlargest ranks by the first column and uses the following ones to break ties.
top_players = df.nlargest(10, radar_columns)

# Express every metric as a fraction of the best value among the selected players.
selected_metrics = top_players[radar_columns]
normalized_top_players = selected_metrics.div(selected_metrics.max(), axis="columns")

attributes = ["Goals", "Assists", "Shots", "Passes Completed", "Touches", "Successful Dribbles"]

# One spoke per attribute; the first angle is repeated so each outline closes.
angles = np.linspace(0, 2 * np.pi, len(attributes), endpoint=False).tolist()
angles.append(angles[0])

fig, ax = plt.subplots(figsize=(6, 6), subplot_kw={"polar": True})
ax.set_xticks(angles[:-1])
ax.set_xticklabels(attributes, color="grey", size=12)
ax.set_rlabel_position(30)
ax.set_yticks([0.25, 0.5, 0.75, 1])
ax.set_yticklabels(["25%", "50%", "75%", "100%"], color="grey", size=10)
ax.set_ylim(0, 1)

for player_name, (_, scaled_row) in zip(top_players["Player"], normalized_top_players.iterrows()):
    stats = scaled_row.tolist()
    stats.append(stats[0])  # repeat the first value to close the polygon
    ax.plot(angles, stats, linewidth=2, label=player_name)
    ax.fill(angles, stats, alpha=0.25)

ax.legend(loc="center left", bbox_to_anchor=(1.15, 0.5), ncol=1)
ax.set_title("Top 10 Players")
plt.show()
# Renumber the rows from 0 and keep only the name plus the six radar metrics;
# the explicit column selection also leaves out the old index and 'Unnamed: 0'.
top_players = top_players.reset_index(drop=True)[
    ["Player", "Gls", "Ast", "Shots", "pass_comp", "Touches", "successful_dribbles"]
]
top_players
| Player | Gls | Ast | Shots | pass_comp | Touches | successful_dribbles | |
|---|---|---|---|---|---|---|---|
| 0 | Kylian Mbappé | 8 | 2 | 29 | 173.0 | 322.0 | 25.0 |
| 1 | Lionel Messi | 7 | 3 | 27 | 300.0 | 462.0 | 15.0 |
| 2 | Olivier Giroud | 4 | 0 | 16 | 48.0 | 111.0 | 0.0 |
| 3 | Julián Álvarez | 4 | 0 | 11 | 93.0 | 176.0 | 0.0 |
| 4 | Álvaro Morata | 3 | 1 | 8 | 28.0 | 60.0 | 1.0 |
| 5 | Gonçalo Ramos | 3 | 1 | 7 | 25.0 | 56.0 | 0.0 |
| 6 | Marcus Rashford | 3 | 0 | 11 | 49.0 | 92.0 | 5.0 |
| 7 | Enner Valencia | 3 | 0 | 8 | 40.0 | 99.0 | 2.0 |
| 8 | Richarlison | 3 | 0 | 8 | 40.0 | 99.0 | 0.0 |
| 9 | Bukayo Saka | 3 | 0 | 7 | 84.0 | 150.0 | 2.0 |
# Now I would like to find out the contribution of Lionel Messi to Team Argentina based on the same metrics taken above
metrics = ['Gls', 'Ast', 'Shots', 'pass_comp', 'Touches', 'successful_dribbles']
df_messi = df[(df['Player'] == 'Lionel Messi') & (df['Team'] == 'Argentina')]
values = df_messi[metrics].values[0]

# Overall performance of Argentina's team: each metric summed over the squad.
df_team = df[df['Team'] == 'Argentina']
total_values = df_team[metrics].sum().values

# The radial axis must reach the largest value that is actually plotted. The team
# trace holds squad totals, which exceed any single player's figures, so sizing the
# axis from the largest individual value would push the team trace off the chart.
radial_max = float(np.nanmax(np.concatenate([total_values, values])))

# Radar chart: team totals in blue, the player's own figures in red.
fig = go.Figure()
fig.add_trace(go.Scatterpolar(
    r=total_values,
    theta=metrics,
    fill='toself',
    name='Argentina',
    line_color='blue'
))
fig.add_trace(go.Scatterpolar(
    r=values,
    theta=metrics,
    fill='toself',
    name='Lionel Messi',
    line_color='red'
))
fig.update_layout(
    polar=dict(
        radialaxis=dict(
            visible=True,
            range=[0, radial_max]
        )
    ),
    showlegend=True,
    title={
        'text': 'Lionel Messi Contribution to Argentina Team Performance',
        'y': 0.95,
        'x': 0.5,
        'xanchor': 'center',
        'yanchor': 'top'
    },
    margin=dict(l=50, r=50, t=100, b=50)
)
fig.show()
# And now I would like to find out the contribution of Kylian Mbappe to Team France
metrics = ['Gls', 'Ast', 'Shots', 'pass_comp', 'Touches', 'successful_dribbles']
df_mbappe = df[(df['Player'] == 'Kylian Mbappé') & (df['Team'] == 'France')]
values = df_mbappe[metrics].values[0]

# Overall performance of France's team: each metric summed over the squad.
df_team = df[df['Team'] == 'France']
total_values = df_team[metrics].sum().values

# The radial axis must reach the largest value that is actually plotted. The team
# trace holds squad totals, which exceed any single player's figures, so sizing the
# axis from the largest individual value would push the team trace off the chart.
radial_max = float(np.nanmax(np.concatenate([total_values, values])))

# Radar chart: team totals in blue, the player's own figures in red.
fig = go.Figure()
fig.add_trace(go.Scatterpolar(
    r=total_values,
    theta=metrics,
    fill='toself',
    name='France',
    line_color='blue'
))
fig.add_trace(go.Scatterpolar(
    r=values,
    theta=metrics,
    fill='toself',
    name='Kylian Mbappé',
    line_color='red'
))
fig.update_layout(
    polar=dict(
        radialaxis=dict(
            visible=True,
            range=[0, radial_max]
        )
    ),
    showlegend=True,
    title={
        'text': 'Kylian Mbappé Contribution to France Team Performance',
        'y': 0.95,
        'x': 0.5,
        'xanchor': 'center',
        'yanchor': 'top'
    },
    margin=dict(l=50, r=50, t=100, b=50)
)
fig.show()
# Now I want to find the top 10 midfield players of the FIFA World cup 2022
midfield_metrics = ['Ast', 'pass_comp', 'successful_tackles', 'interceptions', 'Touches', 'crosses', 'aerial_duo_won']
midfielders = df.loc[df['Pos'] == 'MF']
# nlargest ranks by the first metric ('Ast') and falls back on the following
# columns, in order, only to break ties - it does not rank by their sum.
top_midfielders = midfielders.nlargest(10, midfield_metrics, keep='first')
# Keep the player name followed by the ranking metrics.
columns = ['Player', *midfield_metrics]
top_midfielders = top_midfielders[columns]
# Created a custom normalization function
def custom_normalize(df):
    """Return a copy of ``df`` with every numeric column shifted to start at 0.

    The minimum of each numeric column is subtracted from that column, so the
    smallest value becomes 0 while the spread of the values is unchanged (the
    columns are not rescaled). The 'Player' column and any other non-numeric
    column are copied through untouched instead of raising on the subtraction.
    The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a 'Player' label column and numeric metric columns.

    Returns
    -------
    pandas.DataFrame
        Shifted copy with the same columns, in the same order, as ``df``.
    """
    result = df.copy()
    for column in df.select_dtypes(include="number").columns:
        if column == 'Player':
            # The label column is never shifted, even if it happens to be numeric.
            continue
        result[column] = df[column] - df[column].min()
    return result
# Run the selected midfielders through custom_normalize before plotting.
normalized_top_midfielders = custom_normalize(top_midfielders)

# Parallel-coordinates plot: one line per player, one vertical axis per metric.
fig, ax = plt.subplots(figsize=(15, 10))
parallel_coordinates(normalized_top_midfielders, 'Player', ax=ax, colormap='Set1')

# Axis labels, title and legend heading.
ax.set_xticklabels(columns[1:])
ax.set_title('Top 10 Midfielders')
player_legend = ax.legend()
player_legend.set_title('Player Names')

# Snap the y-limits outwards to multiples of the tick interval so the first and
# last ticks sit exactly on the edges of the plot.
y_tick_interval = 20
plotted_values = normalized_top_midfielders.iloc[:, 1:]
smallest_value = plotted_values.min().min()
largest_value = plotted_values.max().max()
y_min = np.floor(smallest_value / y_tick_interval) * y_tick_interval
y_max = np.ceil(largest_value / y_tick_interval) * y_tick_interval
ax.set_ylim(y_min, y_max)
ax.yaxis.set_ticks(np.arange(y_min, y_max + y_tick_interval, y_tick_interval))
plt.show()
# Renumber the rows from 0 (discarding the old index) and display the table.
top_midfielders = top_midfielders.reset_index(drop=True)
top_midfielders
| Player | Ast | pass_comp | successful_tackles | interceptions | Touches | crosses | aerial_duo_won | |
|---|---|---|---|---|---|---|---|---|
| 0 | Antoine Griezmann | 3 | 258.0 | 8 | 8 | 384.0 | 40 | 5.0 |
| 1 | Enzo Fernández | 1 | 412.0 | 15 | 3 | 535.0 | 2 | 9.0 |
| 2 | Frenkie de Jong | 1 | 323.0 | 5 | 6 | 412.0 | 3 | 8.0 |
| 3 | Adrien Rabiot | 1 | 237.0 | 9 | 10 | 352.0 | 7 | 15.0 |
| 4 | Jude Bellingham | 1 | 235.0 | 15 | 5 | 324.0 | 4 | 6.0 |
| 5 | Alexis Mac Allister | 1 | 234.0 | 4 | 3 | 332.0 | 1 | 5.0 |
| 6 | Lucas Paquetá | 1 | 169.0 | 5 | 2 | 231.0 | 4 | 3.0 |
| 7 | Andre-Frank Zambo Anguissa | 1 | 113.0 | 0 | 5 | 158.0 | 2 | 4.0 |
| 8 | Aïssa Laïdouni | 1 | 99.0 | 3 | 4 | 157.0 | 3 | 3.0 |
| 9 | Steven Berghuis | 1 | 97.0 | 1 | 0 | 145.0 | 13 | 1.0 |
# Now let's find out Top 10 defenders based on their defensive attributes
defensive_columns = ['successful_tackles', 'interceptions', 'Blocks', 'clearence', 'aerial_duo_won']
legend_names = ['Successful Tackles', 'Interceptions', 'Blocks', 'Clearance', 'Aerial Duels Won']

defenders = df[df['Pos'] == 'DF']
# keep='all' retains every player tied with the last selected one, so more than
# ten rows can come back.
top_defenders = defenders.nlargest(10, defensive_columns, keep='all')

# Grouped bar chart: five bars per player, centred on the player's tick.
labels = top_defenders['Player']
x = np.arange(len(labels))
width = 0.15
fig, ax = plt.subplots(figsize=(15, 8))
for slot, (column, legend_name) in enumerate(zip(defensive_columns, legend_names)):
    # Slots 0..4 map to offsets of -2..+2 bar widths around the tick position.
    ax.bar(x + (slot - 2) * width, top_defenders[column], width, label=legend_name)

ax.set_ylabel('Count')
ax.set_title('Top 10 Defenders in the World Cup')
ax.set_xticks(x)
ax.set_xticklabels(labels, rotation=45, ha='right')
ax.legend()
plt.show()
# Renumber the rows from 0 and keep only the name plus the five defensive metrics;
# the explicit column selection also leaves out the old index and 'Unnamed: 0'.
top_defenders = top_defenders.reset_index(drop=True)[
    ["Player", "successful_tackles", "interceptions", "Blocks", "clearence", "aerial_duo_won"]
]
top_defenders
| Player | successful_tackles | interceptions | Blocks | clearence | aerial_duo_won | |
|---|---|---|---|---|---|---|
| 0 | Achraf Hakimi | 17 | 9 | 7.0 | 12.0 | 5.0 |
| 1 | Theo Hernández | 13 | 4 | 5.0 | 8.0 | 8.0 |
| 2 | Daley Blind | 9 | 5 | 6.0 | 4.0 | 5.0 |
| 3 | Eduardo Camavinga | 9 | 3 | 4.0 | 6.0 | 2.0 |
| 4 | Kalidou Koulibaly | 9 | 2 | 4.0 | 30.0 | 11.0 |
| 5 | Noussair Mazraoui | 9 | 1 | 6.0 | 16.0 | 1.0 |
| 6 | Jules Koundé | 8 | 5 | 7.0 | 8.0 | 5.0 |
| 7 | Josip Juranović | 8 | 3 | 5.0 | 15.0 | 2.0 |
| 8 | Marcos Acuña | 8 | 2 | 8.0 | 9.0 | 5.0 |
| 9 | Nicolás Tagliafico | 7 | 9 | 9.0 | 9.0 | 4.0 |
# Let's find out top 10 Goalkeepers
ranking_columns = ['Minutes_played', 'pass_comp%', 'Err', 'aerial_duo_won', 'CrdY', 'CrdR']
plotted_columns = ['Minutes_played', 'Err', 'aerial_duo_won', 'CrdY', 'CrdR', 'pass_comp%']

goalkeepers = df[df['Pos'] == 'GK']
# Ranked by minutes played first; the remaining columns only break ties, and
# keep='all' retains every player tied with the last selected one.
top_goalkeepers = goalkeepers.nlargest(10, ranking_columns, keep='all')

# Long format for the point plot: one row per (player, attribute) pair.
melted_data = top_goalkeepers.melt(id_vars=['Player'], value_vars=plotted_columns)

# Point plot with one dashed series per attribute.
plt.figure(figsize=(12, 6))
sns.set(style="whitegrid")
ax = sns.pointplot(data=melted_data, x='Player', y='value', hue='variable', palette="deep", linestyles="--", dodge=True)

# Shade a band of one standard deviation around the first few plotted lines
# (as many lines as there are attributes).
# NOTE(review): this assumes those lines are the per-attribute series; which
# lines seaborn adds to ax.lines depends on its version - confirm.
attribute_names = melted_data['variable'].unique()
for series_line, _ in zip(ax.lines, attribute_names):
    line_values = series_line.get_ydata()
    spread = np.std(line_values)
    ax.fill_between(x=series_line.get_xdata(), y1=line_values - spread, y2=line_values + spread, alpha=0.2)

ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')
ax.set_ylabel('Attribute values')
ax.set_xlabel('Goalkeepers')
ax.set_title('Top 10 Goalkeepers in the World Cup')
def onclick(event):
    """Print the goalkeeper, attribute and value nearest to a mouse click.

    The clicked x position selects the player (categorical ticks sit at
    0, 1, 2, ...); among that player's attributes the one whose value is
    closest to the clicked y position is reported. The value is read from
    ``melted_data`` rather than from the plot artists, because the artists
    created by the point plot differ between seaborn versions and the previous
    artist-based lookup raised on every hit.

    Parameters
    ----------
    event : matplotlib mouse event
        Event delivered by the 'button_press_event' callback.
    """
    # Ignore clicks outside the goalkeeper axes (xdata/ydata are None there).
    if event.inaxes is not ax or event.xdata is None or event.ydata is None:
        return
    tick_labels = [tick.get_text() for tick in ax.get_xticklabels()]
    position = int(round(event.xdata))
    if not 0 <= position < len(tick_labels):
        return
    player = tick_labels[position]
    candidates = melted_data[melted_data['Player'] == player].dropna(subset=['value'])
    if candidates.empty:
        return
    # Pick the attribute whose value lies closest to the clicked height.
    nearest = candidates.loc[(candidates['value'] - event.ydata).abs().idxmin()]
    attribute = nearest['variable']
    value = nearest['value']
    print(f"Player: {player}, Attribute: {attribute}, Value: {value}")
# Register onclick for mouse-button presses on the current figure, then display it.
plt.gcf().canvas.mpl_connect('button_press_event', onclick)
plt.show()
# Box plot of player ages, one horizontal box per team.
plt.figure(figsize=(12, 12))
sns.boxplot(data=df, x='Age', y='Team')
plt.xticks(rotation=90)
(array([15., 20., 25., 30., 35., 40., 45.]), [Text(0, 0, ''), Text(0, 0, ''), Text(0, 0, ''), Text(0, 0, ''), Text(0, 0, ''), Text(0, 0, ''), Text(0, 0, '')])
# Now I wanted to group the data by "Team" and "Age", and count the number of players in each group
age_distribution = df.groupby(['Team', 'Age'])['Player'].count().reset_index(name='Count')

# Age x Team table of player counts; combinations that do not occur count as 0.
counts_by_age = age_distribution.pivot(index='Age', columns='Team', values='Count').fillna(0)

# Stacked bar chart of the age distribution. Each team's bars start where the
# previous team's ended; drawing every team from zero at the same x positions
# would let the teams hide one another.
fig, ax = plt.subplots(figsize=(10, 6))
stack_base = np.zeros(len(counts_by_age))
for team in age_distribution['Team'].unique():
    team_counts = counts_by_age[team].to_numpy()
    ax.bar(counts_by_age.index, team_counts, bottom=stack_base, label=team)
    stack_base = stack_base + team_counts
ax.set_title('Age Distribution of Players in Different Teams')
ax.set_xlabel('Age')
ax.set_ylabel('Number of Players')
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()
age_distribution
| Team | Age | Count | |
|---|---|---|---|
| 0 | Argentina | 21 | 2 |
| 1 | Argentina | 22 | 1 |
| 2 | Argentina | 23 | 2 |
| 3 | Argentina | 24 | 5 |
| 4 | Argentina | 25 | 1 |
| ... | ... | ... | ... |
| 364 | Wales | 28 | 1 |
| 365 | Wales | 29 | 2 |
| 366 | Wales | 31 | 1 |
| 367 | Wales | 32 | 2 |
| 368 | Wales | 35 | 1 |
369 rows × 3 columns
# Teams with players of the youngest age found in the data, largest count first
# (at most ten rows).
youngest_age = age_distribution['Age'].min()
is_youngest = age_distribution['Age'] == youngest_age
age_younger = age_distribution[is_youngest].sort_values('Count', ascending=False).head(10)
age_younger
| Team | Age | Count | |
|---|---|---|---|
| 10 | Australia | 17 | 1 |
| 66 | Costa Rica | 17 | 1 |
| 141 | Germany | 17 | 1 |
| 299 | Spain | 17 | 1 |
# Mean player age per team.
team_mean_age = df['Age'].groupby(df['Team']).mean()
# The ten teams with the lowest mean age, youngest first.
top_10_youngest_teams = team_mean_age.sort_values(ascending=True).iloc[:10]
print(top_10_youngest_teams)
Team United States 24.100000 Ecuador 24.277778 Ghana 24.950000 Spain 25.523810 Senegal 25.800000 Morocco 25.840000 France 25.875000 Portugal 25.958333 Netherlands 26.047619 Wales 26.055556 Name: Age, dtype: float64
# The ten teams with the highest mean age, oldest first.
top_10_oldest_teams = team_mean_age.sort_values(ascending=False).iloc[:10]
print(top_10_oldest_teams)
Team IR Iran 28.047619 Belgium 27.950000 Mexico 27.666667 Uruguay 27.631579 Qatar 27.600000 Brazil 27.500000 Canada 27.368421 Korea Republic 27.333333 Croatia 27.238095 Saudi Arabia 27.130435 Name: Age, dtype: float64
# Two-column table (Team, Age) of the ten youngest teams, rows numbered from 0.
youngest_teams_df = top_10_youngest_teams.reset_index()
youngest_teams_df
| Team | Age | |
|---|---|---|
| 0 | United States | 24.100000 |
| 1 | Ecuador | 24.277778 |
| 2 | Ghana | 24.950000 |
| 3 | Spain | 25.523810 |
| 4 | Senegal | 25.800000 |
| 5 | Morocco | 25.840000 |
| 6 | France | 25.875000 |
| 7 | Portugal | 25.958333 |
| 8 | Netherlands | 26.047619 |
| 9 | Wales | 26.055556 |
# Two-column table (Team, Age) of the ten oldest teams, rows numbered from 0.
oldest_teams_df = top_10_oldest_teams.reset_index()
oldest_teams_df
| Team | Age | |
|---|---|---|
| 0 | IR Iran | 28.047619 |
| 1 | Belgium | 27.950000 |
| 2 | Mexico | 27.666667 |
| 3 | Uruguay | 27.631579 |
| 4 | Qatar | 27.600000 |
| 5 | Brazil | 27.500000 |
| 6 | Canada | 27.368421 |
| 7 | Korea Republic | 27.333333 |
| 8 | Croatia | 27.238095 |
| 9 | Saudi Arabia | 27.130435 |
# Player rows belonging to the ten youngest and the ten oldest teams.
youngest_team_rows = df[df['Team'].isin(top_10_youngest_teams.index)]
oldest_team_rows = df[df['Team'].isin(top_10_oldest_teams.index)]

# Total goals scored by each of the 10 youngest teams
youngest_teams_goals = youngest_team_rows.groupby('Team')['Gls'].sum().reset_index(name='Total Goals')
# Total goals scored by each of the 10 oldest teams
oldest_teams_goals = oldest_team_rows.groupby('Team')['Gls'].sum().reset_index(name='Total Goals')
# Total shots taken by each of the 10 youngest teams
youngest_teams_shots = youngest_team_rows.groupby('Team')['Shots'].sum().reset_index(name='Total Shots')
# Total shots taken by each of the 10 oldest teams
oldest_teams_shots = oldest_team_rows.groupby('Team')['Shots'].sum().reset_index(name='Total Shots')
# Created a 3x2 grid of subplots
fig = make_subplots(rows=3, cols=2, subplot_titles=('10 Youngest Teams by Mean Age', '10 Oldest Teams by Mean Age',
    'Total Goals Scored by 10 Youngest Teams', 'Total Goals Scored by 10 Oldest Teams', 'Total Shots attemped by the top 10 youngest teams', 'Total Shots attemped by the top 10 oldest teams'))

# One entry per panel:
# (row, col, plotly express builder, data, y column, y-axis title, hover label)
panels = [
    (1, 1, px.bar, youngest_teams_df, 'Age', 'Mean Age', 'Mean Age'),
    (1, 2, px.bar, oldest_teams_df, 'Age', 'Mean Age', 'Mean Age'),
    (2, 1, px.scatter, youngest_teams_goals, 'Total Goals', 'Total Goals', 'Total Goals'),
    (2, 2, px.scatter, oldest_teams_goals, 'Total Goals', 'Total Goals', 'Total Goals'),
    (3, 1, px.scatter, youngest_teams_shots, 'Total Shots', 'Total Shots Attempted', 'Total Shots'),
    (3, 2, px.scatter, oldest_teams_shots, 'Total Shots', 'Total Shots Attempted', 'Total Shots'),
]
for row, col, build_chart, data, y_column, y_title, hover_label in panels:
    trace = build_chart(data, x='Team', y=y_column).data[0]
    # Set the hover text on the trace itself. Updating traces by type after each
    # add would overwrite the template of every earlier trace of that type, which
    # left the goal panels labelled "Total Shots".
    trace.hovertemplate = 'Team: %{x}<br>' + hover_label + ': %{y}'
    fig.add_trace(trace, row=row, col=col)
    fig.update_xaxes(title_text='Team', row=row, col=col)
    fig.update_yaxes(title_text=y_title, row=row, col=col)

# Mean-age panels: fixed 20-30 y-range and slanted team names on BOTH panels
# (the layout key `xaxis_tickangle` only addresses the first x-axis).
for col in (1, 2):
    fig.update_yaxes(range=[20, 30], row=1, col=col)
    fig.update_xaxes(tickangle=-45, row=1, col=col)

fig.update_layout(height=1000, width=1000, title='Subplots of Team Stats', showlegend=False)
fig.show()